library(data.table)
library(dplyr)
library(lubridate)
library(leaflet)
library(pals)
library(ggplot2)
# Load the 2004 daily PM2.5 records and report their dimensions.
# The path is passed as `file =` rather than positionally: it contains a
# space, and fread() treats a positional input with a space as a shell
# command when no such file exists, instead of reporting a missing file.
# NOTE(review): this absolute path is specific to one machine -- consider a
# project-relative path.
aqi2004 <- data.table::fread(
  file = 'C:/Users/Brandyn Ruiz/OneDrive/USC/PM566/Assignment1/aqi2004.csv'
)
dim(aqi2004)
## [1] 19233    20
names(aqi2004)
##  [1] "Date"                           "Source"                        
##  [3] "Site ID"                        "POC"                           
##  [5] "Daily Mean PM2.5 Concentration" "UNITS"                         
##  [7] "DAILY_AQI_VALUE"                "Site Name"                     
##  [9] "DAILY_OBS_COUNT"                "PERCENT_COMPLETE"              
## [11] "AQS_PARAMETER_CODE"             "AQS_PARAMETER_DESC"            
## [13] "CBSA_CODE"                      "CBSA_NAME"                     
## [15] "STATE_CODE"                     "STATE"                         
## [17] "COUNTY_CODE"                    "COUNTY"                        
## [19] "SITE_LATITUDE"                  "SITE_LONGITUDE"
str(aqi2004)
## Classes 'data.table' and 'data.frame':   19233 obs. of  20 variables:
##  $ Date                          : chr  "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
##  $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
##  $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
##  $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Daily Mean PM2.5 Concentration: num  8.9 12.2 16.5 19.5 11.5 32.5 15.5 29.9 21 16.9 ...
##  $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
##  $ DAILY_AQI_VALUE               : int  37 51 60 67 48 94 58 88 70 61 ...
##  $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
##  $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ AQS_PARAMETER_CODE            : int  88101 88502 88502 88502 88502 88502 88502 88502 88502 88502 ...
##  $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" ...
##  $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
##  $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
##  $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ STATE                         : chr  "California" "California" "California" "California" ...
##  $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(aqi2004)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 01/01/2004    AQS 60010007   1                            8.9 ug/m3 LC
## 2: 01/02/2004    AQS 60010007   1                           12.2 ug/m3 LC
## 3: 01/03/2004    AQS 60010007   1                           16.5 ug/m3 LC
## 4: 01/04/2004    AQS 60010007   1                           19.5 ug/m3 LC
## 5: 01/05/2004    AQS 60010007   1                           11.5 ug/m3 LC
## 6: 01/06/2004    AQS 60010007   1                           32.5 ug/m3 LC
##    DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              37 Livermore               1              100
## 2:              51 Livermore               1              100
## 3:              60 Livermore               1              100
## 4:              67 Livermore               1              100
## 5:              48 Livermore               1              100
## 6:              94 Livermore               1              100
##    AQS_PARAMETER_CODE                     AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101               PM2.5 - Local Conditions     41860
## 2:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 3:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 4:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 5:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 6:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
##                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
## 1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
##    SITE_LATITUDE SITE_LONGITUDE
## 1:      37.68753      -121.7842
## 2:      37.68753      -121.7842
## 3:      37.68753      -121.7842
## 4:      37.68753      -121.7842
## 5:      37.68753      -121.7842
## 6:      37.68753      -121.7842
tail(aqi2004)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 12/14/2004    AQS 61131003   1                             11 ug/m3 LC
## 2: 12/17/2004    AQS 61131003   1                             16 ug/m3 LC
## 3: 12/20/2004    AQS 61131003   1                             17 ug/m3 LC
## 4: 12/23/2004    AQS 61131003   1                              9 ug/m3 LC
## 5: 12/26/2004    AQS 61131003   1                             24 ug/m3 LC
## 6: 12/29/2004    AQS 61131003   1                              9 ug/m3 LC
##    DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              46 Woodland-Gibson Road               1              100
## 2:              59 Woodland-Gibson Road               1              100
## 3:              61 Woodland-Gibson Road               1              100
## 4:              38 Woodland-Gibson Road               1              100
## 5:              76 Woodland-Gibson Road               1              100
## 6:              38 Woodland-Gibson Road               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     40900
## 2:              88101 PM2.5 - Local Conditions     40900
## 3:              88101 PM2.5 - Local Conditions     40900
## 4:              88101 PM2.5 - Local Conditions     40900
## 5:              88101 PM2.5 - Local Conditions     40900
## 6:              88101 PM2.5 - Local Conditions     40900
##                                  CBSA_NAME STATE_CODE      STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
##    COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1:   Yolo      38.66121      -121.7327
## 2:   Yolo      38.66121      -121.7327
## 3:   Yolo      38.66121      -121.7327
## 4:   Yolo      38.66121      -121.7327
## 5:   Yolo      38.66121      -121.7327
## 6:   Yolo      38.66121      -121.7327

In our 2004 dataset we have 19233 observations with 20 variables.

# Load the 2019 daily PM2.5 records and report their dimensions.
# The path is passed as `file =` rather than positionally: it contains a
# space, and fread() treats a positional input with a space as a shell
# command when no such file exists, instead of reporting a missing file.
# NOTE(review): this absolute path is specific to one machine -- consider a
# project-relative path.
aqi2019 <- data.table::fread(
  file = 'C:/Users/Brandyn Ruiz/OneDrive/USC/PM566/Assignment1/aqi2019.csv'
)
dim(aqi2019)
## [1] 53328    20
names(aqi2019)
##  [1] "Date"                           "Source"                        
##  [3] "Site ID"                        "POC"                           
##  [5] "Daily Mean PM2.5 Concentration" "UNITS"                         
##  [7] "DAILY_AQI_VALUE"                "Site Name"                     
##  [9] "DAILY_OBS_COUNT"                "PERCENT_COMPLETE"              
## [11] "AQS_PARAMETER_CODE"             "AQS_PARAMETER_DESC"            
## [13] "CBSA_CODE"                      "CBSA_NAME"                     
## [15] "STATE_CODE"                     "STATE"                         
## [17] "COUNTY_CODE"                    "COUNTY"                        
## [19] "SITE_LATITUDE"                  "SITE_LONGITUDE"
str(aqi2019)
## Classes 'data.table' and 'data.frame':   53328 obs. of  20 variables:
##  $ Date                          : chr  "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
##  $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
##  $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
##  $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Daily Mean PM2.5 Concentration: num  5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
##  $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
##  $ DAILY_AQI_VALUE               : int  24 50 68 86 47 11 12 29 13 30 ...
##  $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
##  $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
##  $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
##  $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
##  $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
##  $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ STATE                         : chr  "California" "California" "California" "California" ...
##  $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(aqi2019)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 01/01/2019    AQS 60010007   3                            5.7 ug/m3 LC
## 2: 01/02/2019    AQS 60010007   3                           11.9 ug/m3 LC
## 3: 01/03/2019    AQS 60010007   3                           20.1 ug/m3 LC
## 4: 01/04/2019    AQS 60010007   3                           28.8 ug/m3 LC
## 5: 01/05/2019    AQS 60010007   3                           11.2 ug/m3 LC
## 6: 01/06/2019    AQS 60010007   3                            2.7 ug/m3 LC
##    DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              24 Livermore               1              100
## 2:              50 Livermore               1              100
## 3:              68 Livermore               1              100
## 4:              86 Livermore               1              100
## 5:              47 Livermore               1              100
## 6:              11 Livermore               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     41860
## 2:              88101 PM2.5 - Local Conditions     41860
## 3:              88101 PM2.5 - Local Conditions     41860
## 4:              88101 PM2.5 - Local Conditions     41860
## 5:              88101 PM2.5 - Local Conditions     41860
## 6:              88101 PM2.5 - Local Conditions     41860
##                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
## 1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
##    SITE_LATITUDE SITE_LONGITUDE
## 1:      37.68753      -121.7842
## 2:      37.68753      -121.7842
## 3:      37.68753      -121.7842
## 4:      37.68753      -121.7842
## 5:      37.68753      -121.7842
## 6:      37.68753      -121.7842
tail(aqi2019)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 11/11/2019    AQS 61131003   1                           13.5 ug/m3 LC
## 2: 11/17/2019    AQS 61131003   1                           18.1 ug/m3 LC
## 3: 11/29/2019    AQS 61131003   1                           12.5 ug/m3 LC
## 4: 12/17/2019    AQS 61131003   1                           23.8 ug/m3 LC
## 5: 12/23/2019    AQS 61131003   1                            1.0 ug/m3 LC
## 6: 12/29/2019    AQS 61131003   1                            9.1 ug/m3 LC
##    DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              54 Woodland-Gibson Road               1              100
## 2:              64 Woodland-Gibson Road               1              100
## 3:              52 Woodland-Gibson Road               1              100
## 4:              76 Woodland-Gibson Road               1              100
## 5:               4 Woodland-Gibson Road               1              100
## 6:              38 Woodland-Gibson Road               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     40900
## 2:              88101 PM2.5 - Local Conditions     40900
## 3:              88101 PM2.5 - Local Conditions     40900
## 4:              88101 PM2.5 - Local Conditions     40900
## 5:              88101 PM2.5 - Local Conditions     40900
## 6:              88101 PM2.5 - Local Conditions     40900
##                                  CBSA_NAME STATE_CODE      STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
##    COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1:   Yolo      38.66121      -121.7327
## 2:   Yolo      38.66121      -121.7327
## 3:   Yolo      38.66121      -121.7327
## 4:   Yolo      38.66121      -121.7327
## 5:   Yolo      38.66121      -121.7327
## 6:   Yolo      38.66121      -121.7327

In our 2019 dataset we have 53328 observations with 20 variables. There are more records in 2019 than in 2004, possibly because more monitoring sites were reporting in 2019.

# Stack the 2004 and 2019 tables into one long table. Both files share the
# same 20 columns, so this is a row-bind. A full_join() with no `by` would
# instead match on every shared column (including floating-point ones) and
# only reproduces the stacked rows because no 2004 row equals a 2019 row.
join <- data.table::rbindlist(list(aqi2004, aqi2019), use.names = TRUE)
head(join)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 01/01/2004    AQS 60010007   1                            8.9 ug/m3 LC
## 2: 01/02/2004    AQS 60010007   1                           12.2 ug/m3 LC
## 3: 01/03/2004    AQS 60010007   1                           16.5 ug/m3 LC
## 4: 01/04/2004    AQS 60010007   1                           19.5 ug/m3 LC
## 5: 01/05/2004    AQS 60010007   1                           11.5 ug/m3 LC
## 6: 01/06/2004    AQS 60010007   1                           32.5 ug/m3 LC
##    DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              37 Livermore               1              100
## 2:              51 Livermore               1              100
## 3:              60 Livermore               1              100
## 4:              67 Livermore               1              100
## 5:              48 Livermore               1              100
## 6:              94 Livermore               1              100
##    AQS_PARAMETER_CODE                     AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101               PM2.5 - Local Conditions     41860
## 2:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 3:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 4:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 5:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 6:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
##                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
## 1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
##    SITE_LATITUDE SITE_LONGITUDE
## 1:      37.68753      -121.7842
## 2:      37.68753      -121.7842
## 3:      37.68753      -121.7842
## 4:      37.68753      -121.7842
## 5:      37.68753      -121.7842
## 6:      37.68753      -121.7842
tail(join)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 11/11/2019    AQS 61131003   1                           13.5 ug/m3 LC
## 2: 11/17/2019    AQS 61131003   1                           18.1 ug/m3 LC
## 3: 11/29/2019    AQS 61131003   1                           12.5 ug/m3 LC
## 4: 12/17/2019    AQS 61131003   1                           23.8 ug/m3 LC
## 5: 12/23/2019    AQS 61131003   1                            1.0 ug/m3 LC
## 6: 12/29/2019    AQS 61131003   1                            9.1 ug/m3 LC
##    DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              54 Woodland-Gibson Road               1              100
## 2:              64 Woodland-Gibson Road               1              100
## 3:              52 Woodland-Gibson Road               1              100
## 4:              76 Woodland-Gibson Road               1              100
## 5:               4 Woodland-Gibson Road               1              100
## 6:              38 Woodland-Gibson Road               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     40900
## 2:              88101 PM2.5 - Local Conditions     40900
## 3:              88101 PM2.5 - Local Conditions     40900
## 4:              88101 PM2.5 - Local Conditions     40900
## 5:              88101 PM2.5 - Local Conditions     40900
## 6:              88101 PM2.5 - Local Conditions     40900
##                                  CBSA_NAME STATE_CODE      STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
##    COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1:   Yolo      38.66121      -121.7327
## 2:   Yolo      38.66121      -121.7327
## 3:   Yolo      38.66121      -121.7327
## 4:   Yolo      38.66121      -121.7327
## 5:   Yolo      38.66121      -121.7327
## 6:   Yolo      38.66121      -121.7327
# Parse the "mm/dd/yyyy" strings into Date objects and derive the calendar
# year of each record. Columns are referenced by bare name inside mutate()
# (not as join$Date) so the expressions use dplyr's data masking and keep
# working if the table is ever grouped or filtered earlier in the pipe.
join <- join %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    Year = year(Date)
  )
head(join$Year)
## [1] 2004 2004 2004 2004 2004 2004
# Give the columns used below short, syntactic names. Renaming by name (not by
# position) means a change in the file's column order cannot silently rename
# the wrong column; a missing column raises an error instead.
join <- join %>%
  rename(
    PM2.5 = `Daily Mean PM2.5 Concentration`,
    SiteName = `Site Name`,
    lat = SITE_LATITUDE,
    lon = SITE_LONGITUDE
  )
names(join)
##  [1] "Date"               "Source"             "Site ID"           
##  [4] "POC"                "PM2.5"              "UNITS"             
##  [7] "DAILY_AQI_VALUE"    "SiteName"           "DAILY_OBS_COUNT"   
## [10] "PERCENT_COMPLETE"   "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE"          "CBSA_NAME"          "STATE_CODE"        
## [16] "STATE"              "COUNTY_CODE"        "COUNTY"            
## [19] "lat"                "lon"                "Year"
# Colour scale for the map: 2004 is drawn in red, 2019 in blue.
pal <- colorNumeric(c('red', 'blue'), c(2004, 2019))

# Map the monitoring locations. `join` holds one row per daily record, so the
# rows are first reduced to one per location and year; this draws the same
# circles without stacking thousands of identical ones on top of each other.
# A legend documents the colour encoding.
join %>%
  distinct(lat, lon, Year) %>%
  leaflet() %>%
  addProviderTiles('OpenStreetMap') %>%
  addCircles(
    lat = ~lat, lng = ~lon,
    opacity = 1, fillOpacity = 1, radius = 100,
    color = ~pal(Year)
  ) %>%
  addLegend(
    position = 'bottomleft',
    colors = pal(c(2004, 2019)),
    labels = c('2004', '2019'),
    title = 'Year',
    opacity = 1
  )

There are noticeably more monitoring sites in 2019 than in 2004. The majority of the sites are clustered in and around the larger cities in California, but there are also recording sites spread throughout the rest of the state.

# Data-quality checks on the PM2.5 column: number of missing values, number
# of negative readings, and the share of negative readings among all records.
with(join, sum(is.na(PM2.5)))
## [1] 0
with(join, sum(PM2.5 < 0))
## [1] 293
with(join, sum(PM2.5 < 0) / length(PM2.5))
## [1] 0.004037982

Our dataset has no missing values for the daily mean concentration of PM 2.5, but it does contain 293 values below 0. A negative mean concentration does not make physical sense for particles suspended in the air, since the minimum possible value is 0. The proportion of PM 2.5 values reported below 0 is small, however: about 0.4% of all records.

# State level: distribution of PM2.5 for California, one box per year
join %>%
  ggplot(mapping = aes(x = STATE, y = PM2.5, fill = factor(Year))) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentration within California',
    x = 'State',
    fill = 'Year'
  )

Comparing the two years within California, we see that 2004 has more extreme outliers than 2019, as well as a much wider overall range.

# State level: daily PM2.5 readings over time, one plot per year.
# Every record for the year is drawn on a single line, so dates with several
# readings (multiple sites) appear as vertical segments.
join %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in 2004')

join %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in 2019')

In our time series visualization for 2004 we see an unusual spike in PM 2.5 concentration in July-August, peaking at about 250. Apart from that, the concentration follows a cyclical pattern, rising and falling with spikes roughly every quarter of the year. The 2019 concentration of PM 2.5 for the state of California follows the same kind of cyclical pattern, but with many spikes towards the last quarter of the year; the largest spike, in October, reaches just above 120.

# State level: one row of summary statistics per year.
# Each quartile gets its own column so summarise() returns exactly one row per
# group; returning three rows per group repeats min/mean/max and is
# deprecated in newer dplyr releases.
join %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    mean = mean(PM2.5),
    max = max(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = quantile(PM2.5, 0.50, names = FALSE),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    .groups = 'drop'
  )
## # A tibble: 2 x 7
##    Year   min  mean   max   q25 median   q75
##   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1  2004  -0.1 13.1   251      6   10.1  16.3
## 2  2019  -2.2  7.74  121.     4    6.5   9.9

At the state level we see that PM 2.5 concentrations in California were much higher in 2004 than in 2019: the mean, the median and both quartiles are all higher in 2004. The spread is also wider in 2004, with an interquartile range of a little over 10 (from 6.0 at the 1st quartile to 16.3 at the 3rd quartile) compared to about 6 in 2019 (from 4.0 to 9.9). Overall, this indicates a decrease in the concentration of PM 2.5 from 2004 to 2019 in the state of California.

# County level: keep only the records from Los Angeles County
countyLA <- filter(join, COUNTY == 'Los Angeles')
head(countyLA, n = 6)
##          Date Source  Site ID POC PM2.5    UNITS DAILY_AQI_VALUE SiteName
## 1: 2004-01-01    AQS 60370002   1  18.0 ug/m3 LC              63    Azusa
## 2: 2004-01-02    AQS 60370002   1  20.4 ug/m3 LC              68    Azusa
## 3: 2004-01-03    AQS 60370002   1   8.0 ug/m3 LC              33    Azusa
## 4: 2004-01-07    AQS 60370002   1  23.6 ug/m3 LC              75    Azusa
## 5: 2004-01-08    AQS 60370002   1  28.3 ug/m3 LC              85    Azusa
## 6: 2004-01-09    AQS 60370002   1  21.9 ug/m3 LC              72    Azusa
##    DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE       AQS_PARAMETER_DESC
## 1:               1              100              88101 PM2.5 - Local Conditions
## 2:               1              100              88101 PM2.5 - Local Conditions
## 3:               1              100              88101 PM2.5 - Local Conditions
## 4:               1              100              88101 PM2.5 - Local Conditions
## 5:               1              100              88101 PM2.5 - Local Conditions
## 6:               1              100              88101 PM2.5 - Local Conditions
##    CBSA_CODE                          CBSA_NAME STATE_CODE      STATE
## 1:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
## 2:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
## 3:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
## 4:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
## 5:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
## 6:     31080 Los Angeles-Long Beach-Anaheim, CA          6 California
##    COUNTY_CODE      COUNTY     lat       lon Year
## 1:          37 Los Angeles 34.1365 -117.9239 2004
## 2:          37 Los Angeles 34.1365 -117.9239 2004
## 3:          37 Los Angeles 34.1365 -117.9239 2004
## 4:          37 Los Angeles 34.1365 -117.9239 2004
## 5:          37 Los Angeles 34.1365 -117.9239 2004
## 6:          37 Los Angeles 34.1365 -117.9239 2004
# County level: distribution of PM2.5 in Los Angeles County, one box per year
countyLA %>%
  ggplot(mapping = aes(x = COUNTY, y = PM2.5, fill = factor(Year))) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentrations in LA county in 2004 and 2019',
    x = 'County',
    fill = 'Year'
  )

Comparing the PM 2.5 concentrations in LA county between 2004 and 2019, concentrations are higher in 2004, as its quartiles are greater than those in 2019. However, 2019 has the more extreme outliers, with the greatest at about 121.

# County level (Los Angeles County): daily PM2.5 readings over time, one plot
# per year.
countyLA %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA county for 2004')

countyLA %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA county for 2019')

Our time series plots give a closer look at the concentrations in 2004 and 2019: in 2004 the values range up to just above 60, whereas in 2019 they stay mostly just under 25. Both years follow a cyclical pattern, but 2004 has the larger spikes of PM 2.5. The two outliers in 2019 distort the plot, as the spread of the concentration is otherwise rather consistent and much lower than in 2004.

# County level (Los Angeles County): one row of summary statistics per year.
# Each quartile gets its own column so summarise() returns exactly one row per
# group; returning three rows per group repeats min/mean/max and is
# deprecated in newer dplyr releases.
countyLA %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    mean = mean(PM2.5),
    max = max(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = quantile(PM2.5, 0.50, names = FALSE),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    .groups = 'drop'
  )
## # A tibble: 2 x 7
##    Year   min  mean   max   q25 median   q75
##   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1  2004   0.1  17.1  75.6  10.5   14.7  20.4
## 2  2019  -0.5  10.2 121.    6.4    9.5  12.9

From our summary statistics we see that the maximum concentration in 2004 is much lower than the maximum in 2019, but 2004 has the wider interquartile range (about 9.9 compared to about 6.5) and higher quartiles overall. This indicates a decrease in PM 2.5 concentrations from 2004 to 2019 at the Los Angeles county level.

# Site level: keep only the records from the Los Angeles-North Main Street
# monitor and show the first twelve of them
site <- filter(join, SiteName == "Los Angeles-North Main Street")
head(site, n = 12)
##           Date Source  Site ID POC PM2.5    UNITS DAILY_AQI_VALUE
##  1: 2004-01-01    AQS 60371103   1  42.1 ug/m3 LC             117
##  2: 2004-01-02    AQS 60371103   1  25.3 ug/m3 LC              79
##  3: 2004-01-03    AQS 60371103   1   4.8 ug/m3 LC              20
##  4: 2004-01-07    AQS 60371103   1  28.1 ug/m3 LC              85
##  5: 2004-01-08    AQS 60371103   1  36.2 ug/m3 LC             103
##  6: 2004-01-09    AQS 60371103   1  29.6 ug/m3 LC              88
##  7: 2004-01-10    AQS 60371103   1   8.9 ug/m3 LC              37
##  8: 2004-01-11    AQS 60371103   1  15.2 ug/m3 LC              58
##  9: 2004-01-12    AQS 60371103   1  49.7 ug/m3 LC             136
## 10: 2004-01-13    AQS 60371103   1  19.9 ug/m3 LC              67
## 11: 2004-01-15    AQS 60371103   1  20.4 ug/m3 LC              68
## 12: 2004-01-16    AQS 60371103   1  36.6 ug/m3 LC             104
##                          SiteName DAILY_OBS_COUNT PERCENT_COMPLETE
##  1: Los Angeles-North Main Street               1              100
##  2: Los Angeles-North Main Street               1              100
##  3: Los Angeles-North Main Street               1              100
##  4: Los Angeles-North Main Street               1              100
##  5: Los Angeles-North Main Street               1              100
##  6: Los Angeles-North Main Street               1              100
##  7: Los Angeles-North Main Street               1              100
##  8: Los Angeles-North Main Street               1              100
##  9: Los Angeles-North Main Street               1              100
## 10: Los Angeles-North Main Street               1              100
## 11: Los Angeles-North Main Street               1              100
## 12: Los Angeles-North Main Street               1              100
##     AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
##  1:              88101 PM2.5 - Local Conditions     31080
##  2:              88101 PM2.5 - Local Conditions     31080
##  3:              88101 PM2.5 - Local Conditions     31080
##  4:              88101 PM2.5 - Local Conditions     31080
##  5:              88101 PM2.5 - Local Conditions     31080
##  6:              88101 PM2.5 - Local Conditions     31080
##  7:              88101 PM2.5 - Local Conditions     31080
##  8:              88101 PM2.5 - Local Conditions     31080
##  9:              88101 PM2.5 - Local Conditions     31080
## 10:              88101 PM2.5 - Local Conditions     31080
## 11:              88101 PM2.5 - Local Conditions     31080
## 12:              88101 PM2.5 - Local Conditions     31080
##                              CBSA_NAME STATE_CODE      STATE COUNTY_CODE
##  1: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  2: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  3: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  4: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  5: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  6: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  7: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  8: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##  9: Los Angeles-Long Beach-Anaheim, CA          6 California          37
## 10: Los Angeles-Long Beach-Anaheim, CA          6 California          37
## 11: Los Angeles-Long Beach-Anaheim, CA          6 California          37
## 12: Los Angeles-Long Beach-Anaheim, CA          6 California          37
##          COUNTY      lat       lon Year
##  1: Los Angeles 34.06659 -118.2269 2004
##  2: Los Angeles 34.06659 -118.2269 2004
##  3: Los Angeles 34.06659 -118.2269 2004
##  4: Los Angeles 34.06659 -118.2269 2004
##  5: Los Angeles 34.06659 -118.2269 2004
##  6: Los Angeles 34.06659 -118.2269 2004
##  7: Los Angeles 34.06659 -118.2269 2004
##  8: Los Angeles 34.06659 -118.2269 2004
##  9: Los Angeles 34.06659 -118.2269 2004
## 10: Los Angeles 34.06659 -118.2269 2004
## 11: Los Angeles 34.06659 -118.2269 2004
## 12: Los Angeles 34.06659 -118.2269 2004
# Site level: distribution of PM2.5 at the selected LA site, one box per year
site %>%
  ggplot(mapping = aes(x = SiteName, y = PM2.5, fill = factor(Year))) +
  geom_boxplot() +
  labs(
    title = 'Boxplot of PM 2.5 concentrations in LA site in 2004 and 2019',
    x = 'Site Name',
    fill = 'Year'
  )

With our boxplot for the LA site we see that the PM 2.5 concentration is higher in 2004, as its quartiles are clearly greater than the quartiles in 2019. In 2004 there are also more extreme outliers, with the greatest at around 75.

# Site level (selected LA site): daily PM2.5 readings over time, one plot per
# year.
site %>%
  filter(Year == 2004) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA site for 2004')

site %>%
  filter(Year == 2019) %>%
  ggplot(mapping = aes(x = Date, y = PM2.5)) +
  geom_line() +
  labs(title = 'Time Series of PM 2.5 in LA site for 2019')

In our time series plots we see that the PM 2.5 concentration in 2004 is widely spread, with the highest peak occurring in March and another in October. Both years follow a cyclical pattern, but 2019 has a tighter pattern with sharper lines, meaning that the PM 2.5 concentration at the LA site changed rapidly from day to day in 2019. The greatest peak in 2019 occurs in September, at a little over 43.

# Site level (Los Angeles-North Main Street): one row of summary statistics
# per year, computed from the `site` table filtered earlier in the file.
# Each quartile gets its own column so summarise() returns exactly one row per
# group; returning three rows per group repeats min/mean/max and is
# deprecated in newer dplyr releases.
site %>%
  group_by(Year) %>%
  summarise(
    min = min(PM2.5),
    mean = mean(PM2.5),
    max = max(PM2.5),
    q25 = quantile(PM2.5, 0.25, names = FALSE),
    median = quantile(PM2.5, 0.50, names = FALSE),
    q75 = quantile(PM2.5, 0.75, names = FALSE),
    .groups = 'drop'
  )
## # A tibble: 2 x 7
##    Year   min  mean   max   q25 median   q75
##   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1  2004   2    20.1  75    12.8   16.8  23.4
## 2  2019   1.9  11.7  43.5   7.9   10.9  14.5

From our summary statistics we see that 2004 has the wider interquartile range (about 10.6 compared to about 6.6 in 2019) and that each of its quartiles is higher than the corresponding quartile in 2019. There is therefore a decrease in the concentration of PM 2.5 from 2004 to 2019 at the Los Angeles site level.